1 Setup

Libraries

suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(matrixStats)
  library(parallel)
  library(pander)
  library(plotly)
  library(RColorBrewer)
  library(scatterplot3d)
  library(tidyverse)
  library(tximport)
  library(vsn)
})

Helper functions

source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))

Graphics

#pal <- brewer.pal(8,"Dark2")
pal <- brewer.pal(10,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")

Metadata Sample information

samples <- read_csv(here("doc/samples.csv"))
## Parsed with column specification:
## cols(
##   ScilifeID = col_character(),
##   SubmittedID = col_character(),
##   Stages = col_character(),
##   Description = col_character()
## )
samples$ID <- sub(".*/","",sub("_L00[1,2]","",sub("[1,2]_[1].*_.*DXX_","", sub("_dual.*","",samples$ScilifeID))))  

samples$Batch <- factor(sprintf("B%d",grepl("P7614",samples$ScilifeID)+1))

2 Raw data

filelist <- list.files(here("data/Salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Sanity check to ensure that the data is sorted according to the sample info

filelist <- filelist[match(samples$ScilifeID,sub("_sortmerna.*","",basename(dirname(filelist))))]

stopifnot(all(match(sub("_sortmerna.*","",basename(dirname(filelist))),
                    samples$ScilifeID) == 1:nrow(samples)))

name the file list vector

names(filelist) <- samples$ID

Read the expression at the gene level

counts <- suppressMessages(round(tximport(files = filelist, type = "salmon",txOut=TRUE)$counts))

# For the PCA, Use the vst data instead and use colMedians or colMeans
 tst <- sapply(split.data.frame(t(counts),
                         samples$Stages),
         colSums)

2.1 Quality Control

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "5.6% percent (21573) of 387368 genes are not expressed"

Let us take a look at the sequencing depth, colouring by Stages

dat_s <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(samples)

ggplot(dat_s,aes(x,y,fill=samples$Stages)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning in melt(log10(rowMeans(counts))): The melt generic in data.table
## has been passed a numeric and will attempt to redirect to the relevant
## reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend
## the namespace like reshape2::melt(log10(rowMeans(counts))). In the next
## version, this warning will become an error.
## Warning: Removed 21573 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by Batch.

#samples$Batch <- factor(sprintf("B%d",grepl("P7614",samples$ScilifeID)+1))

dat_b <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Batch=samples$Batch[match(ind,samples$ID)]) %>% 
  mutate(Stages=samples$Stages[match(ind,samples$ID)])

ggplot(dat_b,aes(x=values,group=ind,col=Batch)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 17209420 rows containing non-finite values (stat_density).

ggplot(dat_b,aes(x=values,group=ind,col=Stages)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 17209420 rows containing non-finite values (stat_density).

2.2 Export

dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))

3 Data normalisation

3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

samples$Stages <- factor(samples$Stages)

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples,
  design = ~ Batch+Stages)
## converting counts to integer mode
save(dds,file=here("data/analysis/salmon/dds.rda"))

Check the size factors (i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P464_201B P464_202 P464_203 P464_204 P464_205 P464_206 P464_207B
0.7277 1.521 1.491 1.364 1.049 1.373 1.167
Table continues below
P464_208 P464_209B P464_211 P7614_301_S1 P7614_301_S1 P7614_302_S2
1.268 0.5299 1.635 0.959 0.9556 0.96
Table continues below
P7614_302_S2 P7614_303_S3 P7614_303_S3 P7614_304_S4 P7614_304_S4
0.9593 1.084 1.082 1.035 1.04
Table continues below
P7614_305_S5 P7614_305_S5 P7614_306_S6 P7614_306_S6 P7614_307_S7
1.088 1.087 1.304 1.305 1.07
Table continues below
P7614_307_S7 P7614_308_S8 P7614_308_S8 P7614_309_S9 P7614_309_S9
1.07 1.137 1.146 0.8997 0.8973
Table continues below
P7614_310_S10 P7614_310_S10 P7614_311_S11 P7614_311_S11 P7614_312_S12
0.8148 0.8171 0.8993 0.9027 0.8269
Table continues below
P7614_312_S12 P7614_313_S13 P7614_313_S13 P7614_314_S14 P7614_314_S14
0.8256 0.982 0.9788 0.9207 0.924
Table continues below
P7614_315_S15 P7614_315_S15 P7614_316_S16 P7614_316_S16 P7614_317_S17
1.055 1.062 0.9558 0.9541 0.8551
Table continues below
P7614_317_S17 P7614_318_S18 P7614_318_S18 P7614_319_S19 P7614_319_S19
0.8548 0.9647 0.9629 0.9735 0.9734
Table continues below
P7614_320_S20 P7614_320_S20 P7614_321_S21 P7614_321_S21 P7614_322_S22
0.9145 0.9167 1.128 1.13 1.071
P7614_322_S22 P7614_323_S23 P7614_323_S23 P7614_324_S24 P7614_324_S24
1.075 0.9609 0.9636 0.991 0.9911
boxplot(sizes, main="Sequencing libraries size factor")

3.2 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)

dir.create(here("data/analysis/DE"))
## Warning in dir.create(here("data/analysis/DE")): '/mnt/picea/home/ccanovi/
## Git/lncRNAs/data/analysis/DE' already exists
save(vst,file=here("data/analysis/DE/vst-blind.rda"))

vsda <- varianceStabilizingTransformation(dds, blind=FALSE)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
vsta <- assay(vsda)
vsta <- vsta - min(vsta)
save(vsta,file=here("data/analysis/DE/vst-aware.rda"))

Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(vst)>0,])

3.3 QC on the normalised data

3.3.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)

Cumulative components effect

We define the number of variable of the model

nvar=2

nlevel=nlevels(dds$Stages) * nlevels(dds$Batch)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)

3.3.2 3 first dimensions

mar=c(5.1,4.1,4.1,2.1)

The PCA shows that a large fraction of the variance is explained by both variables.

scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(dds$Stages)],
              pch=c(17,19)[as.integer(dds$Batch)])
legend("topleft",
       fill=pal[1:nlevels(dds$Stages)],
       legend=levels(dds$Stages))

legend("topright",
       pch=c(17,19),
       legend=levels(dds$Batch))

par(mar=mar)

3.3.3 2D

pc.dat <- cbind(pc$x,samples)

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=dds$Stages,shape=dds$Batch,text=dds$ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
p <- ggplot(pc.dat,aes(x=PC2,y=PC3,col=dds$Stages,shape=dds$Batch,text=dds$ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC2 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC3 (",percent[2],"%)",sep="")))
p <- ggplot(pc.dat,aes(x=PC1,y=PC3,col=dds$Stages,shape=dds$Batch,text=dds$ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC3 (",percent[2],"%)",sep="")))
p <- ggplot(pc.dat,aes(x=PC1,y=PC4,col=dds$Stages,shape=dds$Batch,text=dds$ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC4 (",percent[2],"%)",sep="")))
p <- ggplot(pc.dat,aes(x=PC2,y=PC4,col=dds$Stages,shape=dds$Batch,text=dds$ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC2 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC4 (",percent[2],"%)",sep="")))

3.3.4 Heatmap

Filter for noise

conds <- factor(paste(samples$Stages,samples$Batch))
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot

vst.cutoff <- 10

Heatmap of “all” genes

hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="")

3.4 Conclusion

CHANGEME

4 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.52.0                  tximport_1.12.3            
##  [3] forcats_0.4.0               stringr_1.4.0              
##  [5] dplyr_0.8.3                 purrr_0.3.3                
##  [7] readr_1.3.1                 tidyr_1.0.0                
##  [9] tibble_2.1.3                tidyverse_1.2.1            
## [11] scatterplot3d_0.3-41        RColorBrewer_1.1-2         
## [13] plotly_4.9.0                pander_0.6.3               
## [15] hyperSpec_0.99-20180627     ggplot2_3.2.1              
## [17] lattice_0.20-38             here_0.1                   
## [19] gplots_3.0.1.1              DESeq2_1.24.0              
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
## [23] BiocParallel_1.18.1         matrixStats_0.55.0         
## [25] Biobase_2.44.0              GenomicRanges_1.36.1       
## [27] GenomeInfoDb_1.20.0         IRanges_2.18.3             
## [29] S4Vectors_0.22.1            BiocGenerics_0.30.0        
## [31] data.table_1.12.6          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       rprojroot_1.3-2        htmlTable_1.13.2      
##   [4] XVector_0.24.0         base64enc_0.1-3        rstudioapi_0.10       
##   [7] hexbin_1.27.3          affyio_1.54.0          bit64_0.9-7           
##  [10] AnnotationDbi_1.46.1   lubridate_1.7.4        xml2_1.2.2            
##  [13] splines_3.6.1          geneplotter_1.62.0     knitr_1.25            
##  [16] zeallot_0.1.0          Formula_1.2-3          jsonlite_1.6          
##  [19] broom_0.5.2            annotate_1.62.0        cluster_2.1.0         
##  [22] shiny_1.4.0            BiocManager_1.30.9     compiler_3.6.1        
##  [25] httr_1.4.1             backports_1.1.5        fastmap_1.0.1         
##  [28] assertthat_0.2.1       Matrix_1.2-17          lazyeval_0.2.2        
##  [31] limma_3.40.6           cli_1.1.0              later_1.0.0           
##  [34] acepack_1.4.1          htmltools_0.4.0        tools_3.6.1           
##  [37] affy_1.62.0            gtable_0.3.0           glue_1.3.1            
##  [40] GenomeInfoDbData_1.2.1 reshape2_1.4.3         Rcpp_1.0.2            
##  [43] cellranger_1.1.0       vctrs_0.2.0            preprocessCore_1.46.0 
##  [46] gdata_2.18.0           nlme_3.1-141           crosstalk_1.0.0       
##  [49] xfun_0.10              testthat_2.2.1         rvest_0.3.4           
##  [52] mime_0.7               lifecycle_0.1.0        gtools_3.8.1          
##  [55] XML_3.98-1.20          zlibbioc_1.30.0        scales_1.0.0          
##  [58] promises_1.1.0         hms_0.5.1              yaml_2.2.0            
##  [61] memoise_1.1.0          gridExtra_2.3          rpart_4.1-15          
##  [64] latticeExtra_0.6-28    stringi_1.4.3          RSQLite_2.1.2         
##  [67] highr_0.8              genefilter_1.66.0      checkmate_1.9.4       
##  [70] caTools_1.17.1.2       rlang_0.4.1            pkgconfig_2.0.3       
##  [73] bitops_1.0-6           evaluate_0.14          labeling_0.3          
##  [76] htmlwidgets_1.5.1      bit_1.1-14             tidyselect_0.2.5      
##  [79] plyr_1.8.4             magrittr_1.5           R6_2.4.0              
##  [82] generics_0.0.2         Hmisc_4.2-0            DBI_1.0.0             
##  [85] pillar_1.4.2           haven_2.1.1            foreign_0.8-72        
##  [88] withr_2.1.2            survival_2.44-1.1      RCurl_1.95-4.12       
##  [91] nnet_7.3-12            modelr_0.1.5           crayon_1.3.4          
##  [94] KernSmooth_2.23-16     rmarkdown_1.16         locfit_1.5-9.1        
##  [97] readxl_1.3.1           blob_1.2.0             digest_0.6.22         
## [100] xtable_1.8-4           httpuv_1.5.2           munsell_0.5.0         
## [103] viridisLite_0.3.0